In [1]:
# necessary imports
import os
import pandas as pd
import numpy as np
import glob
import plotly.express as px
from scipy import signal
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../scripts/')
from kolzur_filter import kz_filter
In [2]:
# set to *.csv to process all
path_to_csv = '../../data/BTW17_Twitter/hashtags/*.csv'
file_list = glob.glob(path_to_csv)

df = pd.DataFrame()

# concatenate dataframes
for index in tqdm(range(len(file_list))):
    file = file_list[index]
    
    # read file to df if first file
    if index==0:
        df = pd.read_csv(file)
        df.drop('Unnamed: 0', axis=1, inplace=True)
    
    # append to df if not first file
    else:
        df2 = pd.read_csv(file)
        df2.drop('Unnamed: 0', axis=1, inplace=True)
        df = df.append(df2)

df = df.groupby(['date', 'hashtag'], as_index=False).sum('count')
df.describe(include='all')

Out[2]:
date hashtag count
count 99167 99167 99167.000000
unique 120 30595 NaN
top 2017-09-24 cdu NaN
freq 1386 120 NaN
mean NaN NaN 10.305565
std NaN NaN 84.494900
min NaN NaN 1.000000
25% NaN NaN 1.000000
50% NaN NaN 1.000000
75% NaN NaN 4.000000
max NaN NaN 9412.000000
In [3]:
# plot top 25 hashtags
top25 = df[['hashtag','count']].groupby('hashtag', as_index=False).sum('count').nlargest(columns='count', n=25)
df.sort_values(by='date', inplace=True)
fig1 = px.line(df[df['hashtag'].isin(top25['hashtag'])], x='date', y='count', color='hashtag', title='top25 hashtags',
              template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig1.show()
In [4]:
def plot_peak_detection(hashtag, k):
    wavelets = df[df['hashtag']==hashtag][['date', 'count']]
    filtered_wavelets = [0] * len(wavelets)
    half_k = int(k/2)
    filtered_wavelets[half_k:-half_k] = kz_filter(wavelets['count'].to_numpy(), k, 1)
    wavelets['filtered_count'] = filtered_wavelets
    results_prom = []
    for i in range(1,11):
        peakind_loop = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/i))
        prominences = signal.peak_prominences(filtered_wavelets, peakind_loop)
        results_prom.append(prominences[0].mean())
        
    id_max_prom = results_prom.index(max(results_prom)) + 1
    peakind = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/id_max_prom))
    fig = px.line(wavelets, x='date', y=['count', 'filtered_count'], title=hashtag,
                  template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
    
    for item in peakind:
        peak = wavelets['date'].tolist()[item]
        fig.add_vrect(x0=str(datetime.strptime(peak, '%Y-%m-%d').date() - timedelta(days=3)),
                      x1=str(datetime.strptime(peak, '%Y-%m-%d').date() + timedelta(days=3)),
                      line_width=0,
                      fillcolor='grey',
                      opacity=0.2)
    print(f'\nmean peak prom: {prominences[0].mean()}')
    fig.show()
In [5]:
for hashtag in top25['hashtag']:
    plot_peak_detection(hashtag, 7)
mean peak prom: 70.79591836734699
mean peak prom: 76.17142857142854
mean peak prom: 68.3035714285714
mean peak prom: 3.3214285714285694
mean peak prom: 42.10204081632653
mean peak prom: 16.66666666666666
mean peak prom: 0.0
mean peak prom: 428.5714285714285
mean peak prom: 16.142857142857146
mean peak prom: 272.39285714285717
mean peak prom: 5.163265306122449
mean peak prom: 0.0
mean peak prom: 1.2448979591836735
mean peak prom: 17.571428571428573
mean peak prom: 2.542857142857143
mean peak prom: 3.1190476190476186
mean peak prom: 6.3265306122448965
mean peak prom: 29.61904761904762
mean peak prom: 15.938775510204081
mean peak prom: 36.38775510204081
mean peak prom: 4.057142857142856
mean peak prom: 15.57142857142857
mean peak prom: 0.0
mean peak prom: 23.71428571428571
mean peak prom: 3.053571428571428
In [6]:
def peak_detection(hashtag, k):
    wavelets = df[df['hashtag']==hashtag][['date', 'count']]
    filtered_wavelets = [0] * len(wavelets)
    half_k = int(k/2)
    filtered_wavelets[half_k:-half_k] = kz_filter(wavelets['count'].to_numpy(), k, 1)
    wavelets['filtered_count'] = filtered_wavelets
    results_prom = []
    for i in range(1,11):
        peakind_loop = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/i))
        prominences = signal.peak_prominences(filtered_wavelets, peakind_loop)
        results_prom.append(prominences[0].mean())
        
    id_max_prom = results_prom.index(max(results_prom)) + 1
    peakind = signal.find_peaks_cwt(filtered_wavelets, np.arange(1, len(wavelets)/id_max_prom))
    
    return(peakind)
In [7]:
peak_df = pd.DataFrame(columns=['peak', 'hashtag'])
hashtag_list = df['hashtag'].unique().tolist()

# remove incomplete time series
num_days = df['date'].nunique()/2 # min 60 days
complete_hashtags = []
for index in tqdm(range(len(hashtag_list))):
    hashtag = hashtag_list[index]
    if df[df['hashtag']==hashtag]['date'].nunique() >= num_days:
        complete_hashtags.append(hashtag)
    df_clean = df[df['hashtag'].isin(complete_hashtags)]
    
# get peak indices
for index in tqdm(range(len(complete_hashtags))):
    hashtag = complete_hashtags[index]
    results = peak_detection(hashtag, 7)
    if index == 1:
        peak_df['peak'] = pd.Series(results)        
        peak_df['hashtag'] = hashtag
    else:
        for item in results:
            peak_df = peak_df.append({'peak': item, 'hashtag': hashtag}, ignore_index=True)
            
peak_df.dropna(inplace=True)


In [9]:
# save to csv
path_file = '../../data/BTW17_Twitter/peaks/peaks.csv'
peak_df.to_csv(path_file)